# Build a directed commuting network from an edge list of CCA-to-CCA flows,
# using the global `ccas` table as the node list. Adds endpoint names to the
# edges and flow-weighted centrality measures to the nodes.
#
# Fix: the original assigned the pipeline to an unused local `g`, which also
# made the function's return value invisible. Return the pipeline directly.
create_graph <- function(flows) {
  tbl_graph(edges = flows, nodes = ccas) %>%
    activate(edges) %>%
    # Record endpoint names on each edge; .N() accesses the node table from
    # edge context
    mutate(
      from_name = .N()$name[from],
      to_name = .N()$name[to]
    ) %>%
    activate(nodes) %>%
    # Compute centrality measures, weighted by flow size n
    mutate(
      in_degree = centrality_degree(weights = n, mode = "in"),
      out_degree = centrality_degree(weights = n, mode = "out"),
      eigen = centrality_eigen(weights = n, directed = TRUE, scale = TRUE)
    )
}

# Build the commuting networks for the two census years
g2002 <- create_graph(cca_flows_2002)
g2022 <- create_graph(cca_flows_2022)
# Get the commuting network for the given year. Only 2002 and 2022 are
# available; any other year is an error.
g <- function(year) {
  switch(as.character(year),
    "2002" = g2002,
    "2022" = g2022,
    stop("Only years allowed are 2002 and 2022")
  )
}

# Return the node table of the given year's network as an sf object
# (the nodes carry a geometry column from `ccas`).
get_sf <- function(year) {
  node_table <- g(year) %>%
    activate(nodes) %>%
    as_tibble()
  st_as_sf(node_table)
}
# Number of nodes
g2022 %>% vcount()
## [1] 77
# Number of non-zero edges. The edge list contains every dyad, including
# zero flows, so filter before counting.
g2022 %E>% filter(n > 0) %>% ecount()
## [1] 5656

Compute entropy measures. igraph implements entropy with the diversity() function, but only for undirected graphs, so we’ll have to implement in- and out-entropy ourselves.

# A generic function to compute the in- or out-entropy value for a single node,
# given a vector of its in- or out-commuting flows.
# Normalized Shannon entropy (base 2) of a vector of flows, scaled by the
# maximum possible entropy log2(n) so the result lies in [0, 1].
#
# values: a vector of non-negative flow counts.
# n: the number of possible destinations/sources (defaults to the vector
#    length); used only for the normalization denominator.
entropy <- function(values, n = length(values)) {
  shares <- values / sum(values)
  # A zero share contributes nothing to the entropy sum (lim p*log p = 0)
  terms <- ifelse(shares == 0, 0, shares * log2(shares))
  -sum(terms) / log2(n)
}

# Tests. These all return TRUE.
# entropy(c(5, 5, 5), 3) == 1
# entropy(c(0,0,100), 3) == 0
# entropy(c(100), 3) == 0
# Add in-entropy and out-entropy to the network objects!
# Add an in_entropy node column: the normalized entropy of each node's
# incoming flow distribution. Idempotent: warns and returns the graph
# unchanged if the column already exists.
#
# Fix: the original re-activated and filtered the full graph once per node
# (O(V * E) tidygraph operations). Pull the edge table once up front and
# index it with plain vector subsetting instead.
add_in_entropy <- function(g) {
  if ("in_entropy" %in% colnames(g %N>% as_tibble())) {
    warning("In-entropy has already been computed. Skipping.")
    return(g)
  }

  # Extract the edge table once rather than filtering the graph per node
  edges <- g %E>% as_tibble()
  n_nodes <- vcount(g)

  g %>%
    activate(nodes) %>%
    mutate(
      in_entropy = map_dbl(seq_len(n_nodes), \(node) {
        # All in-commuting flows to this node.
        # Maximum number of edges to this node is 1 less than total nodes.
        entropy(edges$n[edges$to == node], n_nodes - 1)
      })
    )
}

# Add an out_entropy node column: the normalized entropy of each node's
# outgoing flow distribution. Idempotent: warns and returns the graph
# unchanged if the column already exists.
#
# Fix: the original re-activated and filtered the full graph once per node
# (O(V * E) tidygraph operations). Pull the edge table once up front and
# index it with plain vector subsetting instead.
add_out_entropy <- function(g) {
  if ("out_entropy" %in% colnames(g %N>% as_tibble())) {
    warning("Out-entropy has already been computed. Skipping.")
    return(g)
  }

  # Extract the edge table once rather than filtering the graph per node
  edges <- g %E>% as_tibble()
  n_nodes <- vcount(g)

  g %>%
    activate(nodes) %>%
    mutate(
      out_entropy = map_dbl(seq_len(n_nodes), \(node) {
        # All out-commuting flows from this node.
        # Maximum number of edges from this node is 1 less than total nodes.
        entropy(edges$n[edges$from == node], n_nodes - 1)
      })
    )
}

# Attach both entropy measures to each year's network
g2002 <- g2002 %>% add_in_entropy() %>% add_out_entropy()
g2022 <- g2022 %>% add_in_entropy() %>% add_out_entropy()

What proportion of Chicago workers commute from outside Chicago?

I use these figures in my data section.

# Tract-level flows for the whole region (columns: work tract, home tract, count)
tract_flows <- read_csv("data/tract_flows_2022.csv", col_types = "cci")
# Tract -> CCA lookup; tracts not in this table are outside Chicago
tract_to_cca <- tracts %>% as_tibble() %>% select(GEOID, cca)
# The cca_flows data frames don't include anything outside Chicago, so we have
# to do these joins again.
# After the left joins, w_cca/h_cca are NA exactly when the tract has no CCA
# match, i.e. is outside Chicago.
tract_flows %>%
  left_join(tract_to_cca, by = join_by(w_tract == GEOID)) %>%
  rename(w_cca = cca) %>%
  left_join(tract_to_cca, by = join_by(h_tract == GEOID)) %>%
  rename(h_cca = cca) %>%
  mutate(works_in_chicago = !is.na(w_cca), lives_in_chicago = !is.na(h_cca)) %>%
  group_by(works_in_chicago, lives_in_chicago) %>%
  summarize(n = sum(n), .groups = "drop")
## # A tibble: 4 × 3
##   works_in_chicago lives_in_chicago       n
##   <lgl>            <lgl>              <int>
## 1 FALSE            FALSE            3783860
## 2 FALSE            TRUE              394784
## 3 TRUE             FALSE             613740
## 4 TRUE             TRUE              740302

Out-degree

# Out-degree maps. map_by is defined elsewhere; from usage it appears to
# return a function of a year that draws the map — TODO confirm.
map_out_degree <- map_by("out_degree", "Out-commuting flow", "Inter-CCA Out-commuting", tm_scale_continuous_log(values = "brewer.blues", limits = c(500, 40000)))

map_out_degree(2002)

map_out_degree(2022)

# Map the percent change in out-degree between 2002 and 2022
combined %>%
  mutate(
    change = out_degree_2022 - out_degree_2002,
    pct_change = out_degree_2022 / out_degree_2002 - 1
  ) %>%
  pct_change_map("Out-degree % change")

Eigenvector centrality

# Eigenvector centrality maps (log scale since values span several orders
# of magnitude)
map_eigen <- map_by("eigen", "Eigenvector centrality", "Eigenvector Centrality",
  tm_scale_continuous_log(values = "viridis", limits = c(0.0001, 1)))
map_eigen(2002)

map_eigen(2022)

# Map the percent change in eigenvector centrality between the two years
combined %>%
  mutate(pct_change = eigen_2022 / eigen_2002 - 1) %>%
  pct_change_map("Percent change")

# Not included
# Scatter of eigenvector centrality vs. distance from the Loop, with CCAs
# split into "Northern"/"Southern" by whether their centroid latitude is at
# least that of the Near West Side's centroid.
cca_graph %N>%
  as_tibble() %>%
  mutate(area = if_else(
    # .[, 2] extracts the Y (latitude) coordinate of the centroid
    st_centroid(geometry) %>% st_coordinates() %>% .[, 2] >=
      st_centroid(filter(ccas, name == "Near West Side")$geometry) %>%
        st_coordinates() %>%
        .[, 2],
    "Northern",
    "Southern"
  )) %>%
  arrange(desc(eig_centrality)) %>%
  ggplot(aes(x = distance_from_loop, y = eig_centrality)) +
  geom_point(aes(color = area)) +
  # Single pooled linear fit; the per-area fits are left commented out below
  geom_smooth(se = FALSE, method = "lm", size = 0.5, color = "black", formula = y ~ x) +
  # geom_smooth(aes(color = area), se = FALSE, method = "lm", size = 0.5) +
  geom_text_repel(aes(label = name), size = 2) +
  scale_y_log10() +
  scale_color_brewer(palette = "Set1") +
  theme_classic() +
  labs(
    title = "Eigenvector Centrality vs. Distance from the Loop",
    subtitle = '"Northern" neighborhoods are those that are at least as far north as the Near West Side.',
    x = "Distance from the Loop (m)",
    y = "Eigenvector Centrality",
    color = "Area"
  )

In-entropy

# In-entropy maps: evenness of each CCA's incoming flow distribution
# (1 = perfectly uniform; see the entropy() tests above)
map_in_entropy <- map_by("in_entropy", "In-entropy", "In-entropy",
  tm_scale_continuous_log(values = "brewer.reds", limits = c(0.75, 1)))
map_in_entropy(2002)

map_in_entropy(2022)

# Choropleth of a raw (not percent) change stored in df's `change` column,
# on a fixed diverging scale centered at zero.
#
# df: an sf-compatible data frame with a `change` column.
# legend_title: title for the fill legend.
raw_change_map <- function(df, legend_title) {
  diverging_scale <- tm_scale_continuous(
    values = "brewer.rd_bu", limits = c(-0.2, 0.2),
    labels = c("-0.2", "-0.1", "0", "+0.1", "+0.2"),
    midpoint = 0
  )
  tm_shape(df) +
    tm_polygons(
      fill = "change",
      fill.scale = diverging_scale,
      fill.legend = tm_legend(title = legend_title)
    )
}
# Map the raw change in in-entropy between 2002 and 2022
combined %>%
  mutate(change = in_entropy_2022 - in_entropy_2002) %>%
  raw_change_map("In-entropy change")

Out-entropy

# Out-entropy maps: evenness of each CCA's outgoing flow distribution
map_out_entropy <- map_by("out_entropy", "Out-entropy", "Out-entropy",
  tm_scale_continuous_log(values = "brewer.purples", limits = c(0.4, 0.9)))
map_out_entropy(2002)

map_out_entropy(2022)

# Map the raw change in out-entropy between 2002 and 2022
combined %>%
  mutate(change = out_entropy_2022 - out_entropy_2002) %>%
  raw_change_map("Out-entropy change")

Table

# Build a long summary table: one row per (CCA, variable, subvariable),
# where subvariable is one of 2002, 2022, change, pct_change.
df <- combined %>%
  as_tibble() %>%
  # All measure columns are suffixed _2002 or _2022
  select(name, contains("_20")) %>%
  pivot_longer(
    contains("_20"),
    names_to = c("variable", "year"), values_to = "value",
    names_pattern = r"(^([a-z_]+)_(\d{4})$)"
  ) %>%
  # This is the order of the variables I've been using in my paper
  mutate(variable = factor(variable, levels = c("in_degree", "out_degree", "eigen", "in_entropy", "out_entropy"))) %>%
  # Unpivot the years, which we want in separate columns so we can take the difference
  pivot_wider(names_from = year, values_from = value, names_prefix = "value_") %>%
  mutate(change = value_2022 - value_2002, pct_change = (value_2022 / value_2002 - 1) * 100) %>%
  pivot_longer(
    c(value_2002, value_2022, change, pct_change),
    names_to = "subvariable", values_to = "value",
    # Strip the value_ prefix so subvariable reads "2002"/"2022"
    names_transform = . %>% str_replace("value_", "")
  )

# Summary statistics per measure and subvariable.
# Fix: use n() for the group size instead of hard-coding 77 (the number of
# CCAs), so the standard error stays correct if the node set changes.
df %>%
  group_by(variable, subvariable) %>%
  summarize(mean = mean(value), sd = sd(value), se = sd / sqrt(n()), med = median(value))
## `summarise()` has grouped output by 'variable'. You can override using the
## `.groups` argument.
## # A tibble: 20 × 6
## # Groups:   variable [5]
##    variable    subvariable       mean         sd         se        med
##    <fct>       <chr>            <dbl>      <dbl>      <dbl>      <dbl>
##  1 in_degree   2002        8437.      24079.     2744.      3128      
##  2 in_degree   2022        8811.      29752.     3391.      2472      
##  3 in_degree   change       374.       6407.      730.      -364      
##  4 in_degree   pct_change    -6.27       49.9       5.68     -16.3    
##  5 out_degree  2002        8437.       6180.      704.      6970      
##  6 out_degree  2022        8811.       7119.      811.      6490      
##  7 out_degree  change       374.       2407.      274.      -127      
##  8 out_degree  pct_change     6.61       39.3       4.48      -1.43   
##  9 eigen       2002           0.0361      0.123     0.0140     0.00991
## 10 eigen       2022           0.0336      0.128     0.0145     0.00608
## 11 eigen       change        -0.00251     0.0152    0.00174   -0.00292
## 12 eigen       pct_change   -23.8        53.4       6.09     -37.5    
## 13 in_entropy  2002           0.871       0.0357    0.00407    0.865  
## 14 in_entropy  2022           0.882       0.0373    0.00425    0.887  
## 15 in_entropy  change         0.0112      0.0274    0.00312    0.0110 
## 16 in_entropy  pct_change     1.33        3.22      0.367      1.22   
## 17 out_entropy 2002           0.692       0.0710    0.00809    0.709  
## 18 out_entropy 2022           0.638       0.0802    0.00914    0.661  
## 19 out_entropy change        -0.0543      0.0427    0.00486   -0.0477 
## 20 out_entropy pct_change    -7.88        6.40      0.729     -6.74
# Paired t-tests for the 2002 -> 2022 change in each measure across CCAs
# (df = 76, i.e. 77 paired observations)
t.test(combined$in_degree_2022, combined$in_degree_2002, paired = TRUE)
## 
##  Paired t-test
## 
## data:  combined$in_degree_2022 and combined$in_degree_2002
## t = 0.51231, df = 76, p-value = 0.6099
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -1080.134  1828.238
## sample estimates:
## mean difference 
##        374.0519
t.test(combined$out_degree_2022, combined$out_degree_2002, paired = TRUE)
## 
##  Paired t-test
## 
## data:  combined$out_degree_2022 and combined$out_degree_2002
## t = 1.3639, df = 76, p-value = 0.1766
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -172.1816  920.2854
## sample estimates:
## mean difference 
##        374.0519
t.test(combined$eigen_2022, combined$eigen_2002, paired = TRUE)
## 
##  Paired t-test
## 
## data:  combined$eigen_2022 and combined$eigen_2002
## t = -1.4462, df = 76, p-value = 0.1522
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.0059705949  0.0009473825
## sample estimates:
## mean difference 
##    -0.002511606
# The two entropy measures are the only ones with significant changes
t.test(combined$in_entropy_2022, combined$in_entropy_2002, paired = TRUE)
## 
##  Paired t-test
## 
## data:  combined$in_entropy_2022 and combined$in_entropy_2002
## t = 3.5885, df = 76, p-value = 0.0005863
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.004977708 0.017394526
## sample estimates:
## mean difference 
##      0.01118612
t.test(combined$out_entropy_2022, combined$out_entropy_2002, paired = TRUE)
## 
##  Paired t-test
## 
## data:  combined$out_entropy_2022 and combined$out_entropy_2002
## t = -11.172, df = 76, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.06401455 -0.04464320
## sample estimates:
## mean difference 
##     -0.05432887
# Distribution of 2002 eigenvector centrality
ggplot(combined, aes(x = eigen_2002)) +
  geom_density()

Mean distance commuted

# Flow-weighted mean commute distance by home CCA, 2002
mean_distances <- g2002 %>%
  activate(edges) %>%
  as_tibble() %>%
  # cca_distances uses CCA names, not the same indexes as the graphs
  left_join(cca_distances, by = c("from_name" = "from", "to_name" = "to")) %>%
  group_by(from_name) %>%
  # Weighted average: total person-meters divided by total commuters
  summarize(mean_distance = sum(n * distance) / sum(n)) %>%
  rename(from = "from_name")

# Choropleth of mean commute distance by home CCA
g2002 %>%
  activate(nodes) %>%
  left_join(mean_distances, by = c(name = "from")) %>%
  as_tibble() %>%
  st_as_sf() %>%
  tm_shape() +
  tm_polygons(
    fill = "mean_distance",
    fill.scale = tm_scale_continuous(values = "viridis", limits = c(4000, 25000))
  )

# Flow-weighted mean commute distance by home CCA, 2022
# NOTE(review): duplicates the 2002 pipeline above — consider extracting a
# helper that takes the graph as an argument.
mean_distances2 <- g2022 %>%
  activate(edges) %>%
  as_tibble() %>%
  # cca_distances uses CCA names, not the same indexes as the graphs
  left_join(cca_distances, by = c("from_name" = "from", "to_name" = "to")) %>%
  group_by(from_name) %>%
  summarize(mean_distance = sum(n * distance) / sum(n)) %>%
  rename(from = "from_name")

# Choropleth of mean commute distance by home CCA (same scale as 2002)
g2022 %>%
  activate(nodes) %>%
  left_join(mean_distances2, by = c(name = "from")) %>%
  as_tibble() %>%
  st_as_sf() %>%
  tm_shape() +
  tm_polygons(
    fill = "mean_distance",
    fill.scale = tm_scale_continuous(values = "viridis", limits = c(4000, 25000))
  )

K-cores

# K-core decomposition of the 2002 network, after keeping only edges above
# the 75th percentile of flow size (presumably to sparsify the
# near-complete graph first — confirm; 5656 of 77*76 dyads are non-zero)
g(2002) %>%
  activate(edges) %>%
  filter(n > quantile(n, 0.75)) %>%
  activate(nodes) %>%
  mutate(coreness = node_coreness()) %>%
  as_tibble() %>%
  st_as_sf() %>%
  tm_shape() +
  tm_polygons(fill = "coreness", fill.scale = tm_scale_ordinal())

Structural equivalence

# Hierarchically cluster CCAs by structural equivalence of their commuting
# profiles, returning an hclust dendrogram.
get_dend <- function(g) {
  adj <- as_adjacency_matrix(g, attr = "n", sparse = FALSE)
  # Stacking the adjacency matrix over its transpose makes each column a
  # node's combined in- and out-commuting profile, so the clustering
  # considers both directions.
  profiles <- rbind(adj, t(adj))

  # Agglomerative (Ward) clustering on 1 - Pearson correlation between
  # column profiles
  dissim <- as.dist(1 - cor(profiles), upper = FALSE)
  hclust(dissim, method = "ward.D")
}
# Add a `cluster` node column assigning each CCA to one of 5 structural
# equivalence classes. Replaces any existing cluster column.
add_structural_equiv <- function(g) {
  # Drop a stale cluster column before recomputing
  if ("cluster" %in% colnames(g %N>% as_tibble())) {
    g <- g %>% select(-cluster)
  }

  dendrogram <- get_dend(g)
  # plot(dendrogram)

  # Elbow plot of dendrogram heights vs. number of clusters showed that 5
  # clusters is a good choice for both years.
  clusters <- cutree(dendrogram, k = 5)
  # cutree returns a vector named by node, which gives us the join key
  cluster_assignments <- tibble(cluster = clusters, name = names(clusters))

  # Attach the assignments to the node table
  g %>%
    activate(nodes) %>%
    left_join(cluster_assignments, by = "name")
}

# Attach structural-equivalence clusters to both years' networks
g2002 <- add_structural_equiv(g2002)
g2022 <- add_structural_equiv(g2022)

# Note: cutree's cluster IDs are arbitrary labels, so colors are not
# directly comparable between the two maps
map_struct_equiv <- map_by("cluster", "Structural equivalence class", "Structural Equivalence Clusters", scale = tm_scale_categorical())
tmap_arrange(map_struct_equiv(2002), map_struct_equiv(2022))
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.

plot(get_dend(g2002))

plot(get_dend(g2022))

Regular equivalence/blockmodeling

# Regular-equivalence scores for the 2002 network; REGE.nm.for and plot.mat
# presumably come from the blockmodeling package — confirm
mat <- as_adjacency_matrix(g2002, attr = "n", sparse = FALSE)
rege <- REGE.nm.for(mat)$E
# Cluster on regular-equivalence dissimilarity (1 - similarity)
dend <- hclust(as.dist(1 - rege), method = "ward.D")

plot(dend)

# Blockmodel image matrix with 8 clusters
plot.mat(mat, clu=cutree(dend, k=8))

Feedback 4/17/25

Old Ideas

Where do people who live in a given CCA work?

One simple result we can draw from this dataset is simply where people who live in any given commuting area work. For example, below we compare the commuting flows for residents of Hyde Park and Woodlawn. Hyde Park and Woodlawn are adjacent, but because Hyde Park is home to the University of Chicago, most of its residents work there. Woodlawn, on the other hand, has a shortage of local jobs, so most Woodlawn residents commute to the Loop.

# Destinations of commuters who live in `cca`, sorted by flow size
# (largest first). Returns a tibble with columns `to` and `n`.
where_do_people_work <- function(cca) {
  out_flows <- filter(cca_flows_2022, from == cca)
  out_flows %>%
    select(to, n) %>%
    arrange(desc(n))
}
# Map where residents of a given CCA work, with the source CCA outlined in
# red and every CCA filled by the number of workers commuting there.
#
# Fix: removed the trailing comma after the last tm_polygons() argument —
# a trailing empty argument can raise an "argument is empty" error when the
# call's arguments are matched/evaluated.
where_do_people_work_plot <- function(cca) {
  # For debugging:
  # cca <- "Woodlawn"

  where_do_people_work(cca) %>%
    filter(to != "OUTSIDE CHICAGO") %>%
    mutate(is_source = to == cca) %>%
    # Put the source community last so that its borders are on top (plotted last)
    arrange(is_source) %>%
    # Must be left_join so that OUTSIDE CHICAGO is left out
    left_join(ccas, by = c("to" = "name")) %>%
    # If you want to use a log scale, try this
    # mutate(log_n = log10(n + 1)) %>%
    st_as_sf() %>%
    tm_shape() +
    tm_polygons(
      fill = "n",
      fill.legend = tm_legend(title = "Number of Workers"),
      col = "is_source",
      col.scale = tm_scale_categorical(values = c(`TRUE` = "red", `FALSE` = "lightblue", `NA` = "lightblue")),
      col.legend = tm_legend_hide(),
      lwd = "is_source",
      lwd.scale = tm_scale_categorical(values = c(`TRUE` = 2, `FALSE` = 0.5)),
      lwd.legend = tm_legend_hide()
    ) +
    tm_title(
      glue("Where do {cca} residents work?"),
      size = 1.5
    )
}
where_do_people_work_plot("Woodlawn")
where_do_people_work_plot("Hyde Park")

Local jobs availability

This plot shows the number of local jobs per working resident in each community area. In neighborhoods with the largest employment, like O’Hare and the Loop, there are many more jobs than working residents. By contrast, many neighborhoods have nearly ten times as many working residents as jobs, so almost everyone must commute.

# Local jobs per working resident in each CCA
ccas <- ccas %>%
  mutate(jobs_availability = w_total / h_total)

# Fix: removed the trailing comma after the last tm_polygons() argument,
# which leaves an empty trailing argument in the call.
ccas %>%
  # filter(name != "Loop", , name != "Ohare") %>%
  # mutate(pct_work_in_same = work_in_same / w_total * 100) %>%
  tm_shape() +
  tm_polygons(
    fill = "jobs_availability",
    fill.legend = tm_legend(title = "Jobs Availability"),
    fill.scale = tm_scale_continuous_log10(values = "viridis")
  )

Distance and flow size

Intuitively, we would expect that commuting is higher between neighborhoods that are closer together.

# Scatter of flow size vs. inter-CCA distance; jitter spreads the integer
# counts so points don't stack exactly on the log scale
cca_flows_2022 %>%
  filter(from != "OUTSIDE CHICAGO", to != "OUTSIDE CHICAGO") %>%
  # Zero flows don't work with the log scale
  filter(n != 0) %>%
  # Setting amount = 1 makes the y-scale get wacky
  ggplot(aes(x = distance, y = jitter(n, amount = 0.99))) +
  # geom_density2d_filled() +
  geom_point(size = 1, alpha = 0.3) +
  # geom_smooth(se = FALSE, method = "lm", size = 1, color = "steelblue") +
  # scale_x_log10() +
  scale_y_log10() +
  # scale_y_continuous(limits = c(0, 100)) +
  theme_minimal() +
  guides(fill = "none") +
  labs(
    title = "Distance vs. Flow Size",
    x = "Distance (m)",
    y = "Flow Size"
  )

Modeling Commuting Patterns with an ERGM

Modeling this dataset with an exponential random graph model (ERGM) would help understand what factors determine where people work. However, the ERGMs we learned about in class only work with unweighted networks. Krivitsky (2012) extended the ERGM framework to networks with values that represent counts, which is what we have here.

# Drop the external node so the ERGM only models intra-Chicago flows
chicago_only <- cca_graph %N>%
  filter(name != "OUTSIDE CHICAGO")

# Convert to a network object for ergm (asNetwork presumably comes from the
# intergraph package — confirm)
net <- asNetwork(chicago_only)

# Add node covariates
# Log scale; assumes h_in_chicago and w_from_chicago are strictly positive —
# TODO confirm no zeros (log(0) would produce -Inf covariates)
net %v% "residents_log" <- log(chicago_only %N>% pull(h_in_chicago))
net %v% "workers_log" <- log(chicago_only %N>% pull(w_from_chicago))

# Network covariate: geographic distances
# Make a full distance matrix. Recall that all dyads are represented in
# cca_flows, not just non-zero edges.
# NOTE(review): assumes the resulting row/column order matches the vertex
# order in `net` — verify before trusting the edgecov term.
D <- cca_flows_2022 %>%
  filter(from != "OUTSIDE CHICAGO", to != "OUTSIDE CHICAGO") %>%
  select(from, to, distance) %>%
  pivot_wider(names_from = to, values_from = distance) %>%
  column_to_rownames("from") %>%
  as.matrix()
# Valued ERGM with a Poisson reference (Krivitsky 2012); edge counts are in
# the "n" edge attribute.
# Fix: removed the trailing comma after `response = "n"`, which leaves an
# empty trailing argument in the ergm() call.
fit1 <- ergm(
  net ~
    sum + # baseline intensity
    nonzero + # sparsity
    nodeocov("residents_log") + # origin size effect
    nodeicov("workers_log") + # destination size effect
    edgecov(D), # deterrence by distance
  reference = ~Poisson,
  response = "n"
)